# Plot relationship between loan characteristicredit_score and realized prepayment

library(tidyverse)
library(haven)
library(knitr)
library(lubridate)
library(grid)
library(gridExtra)
library(lfe)

opts_chunk$set(warning = FALSE, message = FALSE)
options(knitr.table.format = "markdown")
# To turnoff greying in dplyr
options(pillar.subtle = FALSE)


currdir <- file.path(getwd())
rootdir <- dirname(currdir)
gse_iddir <- file.path(rootdir, "data", "scratch_data", "GSEdata")
tmpdir <- file.path(rootdir, "data", "scratch_data", "temp")
embs2dir <- file.path(rootdir, "data", "scratch_data", "embs", "cached")
outdir <- file.path(rootdir, "TEX", "Figures")

# This is the last performance date for HARP loans (GSE data ends slightly later)
last_perf_dt <- as.Date("2017-08-01")

# Use gse_id-static file since we need seller_id_name
loan_perf <- read_csv(file.path(gse_iddir, "gse_id-static.csv"),
                      col_types = paste0(paste(rep("d", 6), collapse = ""),
                                         "ccdd",
                                         paste(rep("c", 7), collapse = ""), "d"))

harp_orig <- read_dta(file.path(tmpdir, "gse_id-harp-temp.dta"))
harp_perf <- read_dta(file.path(embs2dir, "harp-performance-combined.dta"))

loan_perf <- loan_perf %>%
  filter(purpose_type_id == "P" & prop_type_id == "SF" & occupancy_type_id == "P") %>%
  mutate(loan_close_date = as.Date(paste0(loan_close_date, "01"), "%Y%m%d")) %>%
  mutate(zero_balance_date = as.Date(paste0(zero_balance_date, "01"), "%Y%m%d")) %>%
  mutate(paydown_mnth = (year(zero_balance_date) - year(loan_close_date)) * 12 +
           month(zero_balance_date) - month(loan_close_date)) %>%
  filter(orig_loan_term == 360 & year(loan_close_date) >= 2009 & year(loan_close_date) <= 2012) %>%
  # round orig_int_rate to nearest 0.25
  mutate(orig_int_rate = round(orig_int_rate/0.25) * 0.25)

# Need to get ym to match HARP data
# Baseline is Jan 2010, which matches to 600
loan_perf <- loan_perf %>%
  select(loan_close_date, loan_to_value_orig, orig_loan_amt, gse_id, mcdash_id, paydown_mnth, 
         orig_int_rate, prop_state_id, seller_id_name) %>%
  mutate(agency_id = ifelse(gse_id == 0, "FHL", "FNM")) %>%
  select(-gse_id) %>%
  mutate(orig_ym = (year(loan_close_date) - 2010) * 12 + month(loan_close_date) - 1 + 600)

# Get prepayment by months
# 0 is not prepaid, 1 is prepaid, NA is indeterminate (truncated)
add_gse_id_prepaid <- function(N){
  varname <- paste0("prepaid", N)
  prepaid_vec <- loan_perf %>%
    mutate(!!varname := case_when(loan_close_date > last_perf_dt %m-% months(N) ~ as.integer(NA),
                               is.na(paydown_mnth) ~ 0L,
                               paydown_mnth > N ~ 0L,
                               paydown_mnth <= N ~ 1L)) %>%
    select(!!varname)
  return(prepaid_vec)
}
foo <- map_dfc(seq(12, 96, 12), add_gse_id_prepaid)

loan_perf <- cbind(loan_perf, foo) %>%
  select(loan_to_value_orig, orig_loan_amt, agency_id, orig_ym, starts_with("prepaid"),
         orig_int_rate, prop_state_id, seller_id_name) %>%
  mutate(loan_type_id = "GSE")


# Clean HARP data
# Restrict to HARP 2.0, defined as originated in Jan 2012 or after (612)
harp_orig <- harp_orig %>%
  filter(orig_ym >= 612) %>%
  filter(occpy_status == "O" | occ_stat == "P") %>%
  filter(prop_type_id == "SF") %>%
  select(id_loan_harp, orig_amount, oltv, orig_ym, agency_id, orig_rt, zip_3, seller_id_name,
         credit_scorecore_b) %>%
  rename(orig_int_rate = orig_rt) %>%
  mutate(orig_int_rate = round(orig_int_rate/0.25) * 0.25)

# This join loses some observations
harp_all <- inner_join(harp_orig, harp_perf) %>%
  rename(orig_loan_amt = orig_amount,
         loan_to_value_orig = oltv)

# Transform the variables in harp_all
# 0 is not prepaid, 1 is prepaid, NA is indeterminate (truncated)
add_harp_prepaid <- function(N){
  upb <- harp_all[[paste0("current_upb", N)]]
  prepaid_vec <- 1 - (upb > 0)
  # If prepaid == 1 and upb = NA, then prepaid
  prepaid_vec[(harp_all$prepaid == 1 & is.na(upb))] <- 1

  out_ds <- tibble(tmp = prepaid_vec)
  out_ds[[paste0("prepaid", N)]] <- prepaid_vec
  return(out_ds %>% select(-tmp))
}
foo <- map_dfc(seq(12, 96, 12), add_harp_prepaid)

harp_all <- cbind(harp_all, foo) %>%
  mutate(agency_id = ifelse(agency_id == 1, "FNM", "FHL")) %>%
  select(orig_loan_amt, loan_to_value_orig, agency_id, orig_ym, starts_with("prepaid"),
         orig_int_rate, zip_3, seller_id_name, credit_scorecore_b) %>%
  select(-prepaid) %>%
  mutate(loan_type_id = "HARP")


## For LTV, we use HARP data only
cal_prepay_ltv2 <- function(N){
  tmp <- harp_all %>%
    filter(loan_to_value_orig <= 125 & loan_to_value_orig >= 15) %>%
    mutate(ym_zip3_seller_id = paste0("yr", orig_ym, "_", zip_3, "_", seller_id_name),
           ym_int_rate = paste0("ym", orig_ym, "_", orig_int_rate)) %>%
    mutate(ym = as.factor(orig_ym),
           zip_3 = as.factor(zip_3))
  regdata <- tmp[, c("loan_to_value_orig", "agency_id", "ym_int_rate", "ym", "zip_3", "seller_id_name")]
  regdata$prepaid <- tmp[[paste0("prepaid", N)]]
  regdata <- regdata %>% filter(!is.na(prepaid))

  # Controlling for month f.e. and gse_id
  lmout <- felm(prepaid ~ agency_id | ym_int_rate + ym + zip_3 + seller_id_name, data = regdata)

  resd_data <- tibble(loan_to_value_orig = regdata$loan_to_value_orig,
                      res = as.vector(lmout$residuals))

  avg_res <- resd_data %>%
    mutate(loan_to_value_orig = ceiling(loan_to_value_orig/5)*5) %>%
    group_by(loan_to_value_orig) %>%
    summarize(avg_prepaid_res = mean(res)) %>%
    ungroup()

  p1 <- ggplot(avg_res, aes(loan_to_value_orig, avg_prepaid_res)) +
    geom_line() +
    geom_vline(xintercept = 105, linetype = "dashed", color = "blue") +
    theme_bw() + scale_color_hue(l = 30) +
    xlab("LTV") + 
    theme(axis.title.y = element_blank(),
          plot.background = element_rect(fill = "gray92"))

  return(p1)
}

## Loan size: use loan_perf data

inc <- 5000

cal_prepay_loanamt2 <- function(N){
  tmp <- loan_perf %>%
    filter(orig_loan_amt > 17500) %>%
    filter(orig_loan_amt <= 630000) %>%
    mutate(loanamt_round = round(orig_loan_amt/inc) * inc) %>%
    mutate(ym_state_id_seller_id = paste0("yr", orig_ym, "_", prop_state_id, "_", seller_id_name),
           ym_int_rate = paste0("ym", orig_ym, "_", orig_int_rate)) %>%
    mutate(loanamt_round = loanamt_round/1000 - 417) %>%
    mutate(ym = as.factor(orig_ym),
           prop_state_id = as.factor(prop_state_id))

  regdata <- tmp[, c("loanamt_round", "agency_id", "ym_int_rate", "ym", "prop_state_id", "seller_id_name")]
  regdata$prepaid <- tmp[[paste0("prepaid", N)]]
  regdata <- regdata %>% filter(!is.na(prepaid))

  # Controlling for month f.e. and gse_id
  lmout <- felm(prepaid ~ agency_id | ym_int_rate + ym + prop_state_id + seller_id_name, data = regdata)

  resd_data <- tibble(loanamt_round = regdata$loanamt_round,
                      res = as.vector(lmout$residuals))

  avg_res <- resd_data %>%
    group_by(loanamt_round) %>%
    summarize(avg_prepaid_res = mean(res)) %>%
    ungroup()

  p <- ggplot(avg_res, aes(loanamt_round, avg_prepaid_res)) +
    geom_line() +
    theme_bw() + scale_color_hue(l = 30) +
    geom_vline(xintercept = 0, linetype = "dashed", color = "blue") +
    xlab("Loan Size \u2013 CLL ($1,000)") + 
    theme(axis.title.y = element_blank(),
          plot.background = element_rect(fill = "gray92"))

  return(p)
}

prepay_ltv <- cal_prepay_ltv2(36)
prepay_loanamt <- cal_prepay_loanamt2(36)

ggsave(file.path(outdir, "20190923_loan_prepayment_36mon_ltv.pdf"),
       width = 4, height = 4, plot = prepay_ltv, device = cairo_pdf)
ggsave(file.path(outdir, "20190923_loan_prepayment_36mon_loanamt.pdf"),
       width = 4, height = 4, plot = prepay_loanamt, device = cairo_pdf)

